Introduction

In a prior post (LINK HERE), I began this project: a bit of a personal workshop in learning how to use Stan for spatial modeling. In that post, I did some exploratory data analysis to set up the overarching idea of the project, which is to (a) follow the example in LINK PAPER and use an ICAR model to model spatial dependencies in crashes across NYC and (b) potentially add a treatment/difference in difference component to this model to understand whether the congestion pricing policy has an effect on the number of vehicle crashes. In the interest of breaking this project into digestible chunks, I have decided that this, part 2 of 3, will focus only on the spatial modeling part. To that end, I’ll be basically following this GUIDE from Mitzi Morris.

MENTION INTERPOLATION

# Loading NYC Census Tracts: https://data.cityofnewyork.us/City-Government/2020-Census-Tracts/63ge-mke6/about_data 
nyc_tracts <- read.csv("data/2020_Census_Tracts_20250606.csv", stringsAsFactors = FALSE) %>%
  rename("geometry" = "the_geom")
nyc_tracts <- st_as_sf(nyc_tracts, wkt = "geometry", crs = 4326)
nyc_tracts <- nyc_tracts %>%
  mutate(
    area_sqm = Shape_Area / 27878400
  ) %>%
  select(
    geometry,
    BoroCT2020,
    BoroCode,
    BoroName,
    area_sqm,
    GEOID
  )

# Loading collisions data: https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data 
filter <- c(0, NA)

crashes <- read.csv("data/Motor_Vehicle_Collisions_-_Crashes_20250605.csv") %>%
  select(CRASH.DATE,
         LATITUDE,
         LONGITUDE,
         NUMBER.OF.PERSONS.INJURED,
         NUMBER.OF.PERSONS.KILLED,
         NUMBER.OF.PEDESTRIANS.INJURED,
         NUMBER.OF.PEDESTRIANS.KILLED) %>%
  filter(! LATITUDE %in% filter,
         ! LONGITUDE %in% filter) %>%
  rename(
    "date" = "CRASH.DATE",
    "lat" = "LATITUDE",
    "long" = "LONGITUDE",
    "persons_inj" = "NUMBER.OF.PERSONS.INJURED",
    "persons_death" = "NUMBER.OF.PERSONS.KILLED",
    "ped_inj" = "NUMBER.OF.PEDESTRIANS.INJURED",
    "ped_death" = "NUMBER.OF.PEDESTRIANS.KILLED"
  ) %>%
  mutate(
    date = mdy(date)
    ) %>%
  st_as_sf(coords = c("long","lat"), crs = 4326)

# Grabbing some census variables via Tidycensus, using Keyring for API Key privacy
tidycensus_api_key <- key_get(service = "tidycensus_API", username = "my_tidycensus")
census_api_key(tidycensus_api_key)

census_vars <- get_acs(state = "NY",
                       county = c("Bronx", "Kings", "New York", "Queens", "Richmond"),
                       geography = "tract",
                       variables = c(medincome = "B19013_001",
                                     population = "B01003_001",
                                     median_age = "B01002_001",
                                     transport_baseline = "B08301_001",
                                     transport_pubtransit = "B08301_010"),
                       geometry = FALSE,
                       keep_geo_vars = FALSE,
                       year = 2022,
                       output = "wide"
                       ) %>%
  mutate(
    GEOID = as.numeric(GEOID),
    median_income = medincomeE,
    population = populationE,
    median_age = median_ageE,
    prop_pubtransit = transport_pubtransitE / transport_baselineE
  ) %>%
  select(
    GEOID,
    median_income,
    population,
    median_age,
    prop_pubtransit
  )

# Associating CP Zone, Pre/Post, Treatment, Borough, and Census Tract w/ Observations
crashes <- crashes %>%
  st_join(nyc_tracts, join = st_within) %>%
  filter(! is.na(area_sqm))

# Aggregating/summarizing data to census tract level, no time component
crashes_tract <- crashes %>%
  group_by(BoroCT2020) %>%
  summarize(tot_crashes = n(),
            area = mean(area_sqm)) %>%
  ungroup() %>%
  st_drop_geometry()

# Getting Fragmentation Index from: https://github.com/mitzimorris/geomed_2024/blob/main/data/nyc_study.geojson

frag_data = st_read(file.path("data", "nyc_study.geojson"), quiet = TRUE) %>%
  st_drop_geometry() %>%
  select(BoroCT2010, frag_index, traffic) %>%
  mutate(
    BoroCT2010 = as.numeric(BoroCT2010)
  )

# Joining everything together, selecting only variables that I want
crashes_tracts_geo <- nyc_tracts %>%
  right_join(crashes_tract, 
             by = "BoroCT2020") %>%
  left_join(census_vars,
            by = "GEOID") %>%
  left_join(frag_data,
            by = c("BoroCT2020" = "BoroCT2010")) %>%
  select(! c(area)) %>% 
  filter(! if_any(everything(), is.na))

# Interactive Data Table for Display
crashes_dt <- crashes_tracts_geo %>%
  st_drop_geometry() %>%
  mutate(
    area_sqm = round(area_sqm, 3),
    prop_pubtransit = round(prop_pubtransit, 3)
  )

datatable(crashes_dt,
          extensions = 'Buttons',
          filter = "top",
          rownames = FALSE,
          options = list(
            autoWidth = TRUE,
            scrollX = TRUE
          ),
  class = 'compact',
  escape = FALSE
) %>%
  formatStyle(
    columns = names(crashes_dt),
    `white-space` = "nowrap",
    `height` = "1.5em",
    `line-height` = "1.5em"
  )

More Plots

Histogram of Collisions per Square Mile

ggplot(data = crashes_tracts_geo, aes(x = tot_crashes)) + 
  geom_histogram(aes(y = after_stat(density)), 
                 bins = 500, 
                 color = "lightblue",
                 fill = "lightblue",
                 alpha = 0.75) + 
  geom_density(color = "#FFC600", linewidth = 1) + 
  theme_bw() + 
  labs(title = "Distribution Total Crashes per Census Tract (2024-25)",
       x = "Number of Crashes",
       y = "Density")

Choropleth Maps

Map of Collisions

crash_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "tot_crashes",
              fill.scale = tm_scale_intervals(values = "brewer.yl_or_rd",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Total Crashes, 2024-25"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right") 

crash_map

Predictor Maps

Median Income // Median Age

income_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "median_income",
              fill.scale = tm_scale_intervals(values = "brewer.greens",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Median Income"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

medianage_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "median_age",
              fill.scale = tm_scale_intervals(values = "brewer.blues",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Median Age"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(income_map,
             medianage_map,
             nrow = 1, ncol = 2)

Proportion Using Public Transit to Commute // Population

prop_pubtransit_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "prop_pubtransit",
              fill.scale = tm_scale_intervals(values = "brewer.purples",
                                              style = "jenks",
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Proportion that Commute Using Public Transit"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

pop_per_sqm_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "population",
              fill.scale = tm_scale_intervals(values = "brewer.yl_gn",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Population"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(prop_pubtransit_map,
             pop_per_sqm_map,
             nrow = 1, ncol = 2)

Fragmentation Index // Traffic

frag_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "frag_index",
              fill.scale = tm_scale_intervals(values = "brewer.pu_or",
                                              style = "quantile",
                                              midpoint = 0,
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Fragmentation Index"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

traffic_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "traffic",
              fill.scale = tm_scale_intervals(values = "brewer.reds",
                                              style = "quantile",
                                              midpoint = 0,
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Traffic"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(frag_map,
             traffic_map,
             nrow = 1, ncol = 2)

Moving from a Map to Spatial Dependencies

Computing Properties of the Node Graph

In the output below, we can see that there is 1 region with 0 links (an island), and 4 disjoint subgraphs, meaning networks that there are no paths between (Staten Island is a disjoint subgraph). This was done using the Queen’s case method (think of the squares that a Queen can reach on a chessboard–not only directly bordering neighbors, but diagonals as well.

Morris also recommends checking the rook’s case, so I’m outputing the results from both below:

nyc_nbs_rook = poly2nb(crashes_tracts_geo, queen=FALSE)
## Warning in poly2nb(crashes_tracts_geo, queen = FALSE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(crashes_tracts_geo, queen = FALSE): neighbour object has 8 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
cat("*** Queen metric nb summary ***\n\n")
## *** Queen metric nb summary ***
summary(nyc_nbs)
## Neighbour list object:
## Number of regions: 1956 
## Number of nonzero links: 10788 
## Percentage nonzero weights: 0.2819702 
## Average number of links: 5.515337 
## 2 regions with no links:
## 259, 740
## 7 disjoint connected subgraphs
## Link number distribution:
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13 
##   2  18  53 143 286 466 464 314 149  39  12   6   2   2 
## 18 least connected regions:
## 55 257 521 581 582 644 835 839 893 1210 1249 1402 1490 1576 1591 1605 1635 1893 with 1 link
## 2 most connected regions:
## 1718 1785 with 13 links
cat("\n\n*** Rook metric nb summary ***\n\n")
## 
## 
## *** Rook metric nb summary ***
summary(nyc_nbs_rook)
## Neighbour list object:
## Number of regions: 1956 
## Number of nonzero links: 9052 
## Percentage nonzero weights: 0.2365957 
## Average number of links: 4.627812 
## 3 regions with no links:
## 259, 740, 835
## 8 disjoint connected subgraphs
## Link number distribution:
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12 
##   3  23  77 283 513 599 311  92  38   9   5   2   1 
## 23 least connected regions:
## 55 104 257 391 521 581 582 644 839 893 1210 1249 1402 1421 1453 1490 1518 1576 1591 1605 1635 1726 1893 with 1 link
## 1 most connected region:
## 1718 with 12 links

So…really not much difference. About 1 link, on average, separates the connections for any given tract using the Queen’s case vs. the Rook’s.

Cleaning Neighbor Graph Objects

NEED TO ADD MORE LINKS! OR FIGURE OUT MISSINGNESS

Exactly what was needed in this department was a touch confusing to me. Essentially, I got the idea that (a) I should examine whether existing links in the graph made sense (e.g., if there’s a link across water, does it make sense to keep it in from a pedestrian-focused context?) and then (b) make sure the final graph is fully connected, because that’s a requirement of the ICAR model I want to use. In Morris’s example it seems that part (a) was mostly an exercise in spatial data science, whereas the actual dataset used for the final analysis is the original graph with all of its cross-water connections and added links to make it a single component. The code below achieves this, and checks my work.

Checking My Work

cat('is symmetric? ', is.symmetric.nb(nyc_nbs_connected), '\n')
## is symmetric?  TRUE
summary(nyc_nbs_connected)
## Neighbour list object:
## Number of regions: 1956 
## Number of nonzero links: 10800 
## Percentage nonzero weights: 0.2822839 
## Average number of links: 5.521472 
## 7 disjoint connected subgraphs
## Link number distribution:
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13 
##  19  52 144 287 461 469 313 150  39  12   6   2   2 
## 19 least connected regions:
## 55 257 259 521 581 582 644 835 839 893 1210 1249 1402 1490 1576 1591 1605 1635 1893 with 1 link
## 2 most connected regions:
## 1718 1785 with 13 links
cat('nodes per component')
## nodes per component
table(n.comp.nb(nyc_nbs_connected)$comp.id)
## 
##    1 
## 1956

Model Building Workflow

This is where this notebook gets long. I’m going to document my whole process for iteratively fitting better and better models using Stan.

poisson_model_file = file.path('stan', 'poisson_1.stan')
cat(readLines(poisson_model_file), sep="\n")
## data {
##   int<lower=0> N;
##   array[N] int<lower=0> y; // count outcomes
##   vector<lower=0>[N] E; // exposure
##   int<lower=1> K; // num covariates
##   matrix[N, K] xs; // design matrix
## }
## transformed data {
##   vector[N] log_E = log(E);
## }
## parameters {
##   real beta0; // intercept
##   vector[K] betas; // covariates
## }
## model {
##   y ~ poisson_log(log_E + beta0 + xs * betas);
##   beta0 ~ std_normal();
##   betas ~ std_normal();
## }
## generated quantities {
##   array[N] int y_rep;
##   vector[N] log_lik;
##   {
##     vector[N] eta = log_E + beta0 + xs * betas;
##     y_rep = max(eta) < 26 ? poisson_log_rng(eta) : rep_array(-1, N);
##     for (n in 1:N) {
##       log_lik[n] = poisson_log_lpmf(y[n] | eta[n]);
##     }
##   }
## }

So, this is not my model, it’s the simple Poisson model that Morris uses. So, rather than understanding it because I wrote it, I’m going to understand it because I explain it to you in this section. First, the data: 1. N: The number of observations (in this case, the number of census tracts) 2. y: The outcome measure, number of crashes in 2024-25 3. E: The “exposure” variable, which is the population of each tract. I believe this is separated from the rest of the predictor matrix, xs, so that it can be log transformed and then used as a separate component in the linear predictor. 4. K: The number of predictor variables 5. xs: A matrix containing N rows and K columns, which holds the predictor variables

The transformed data block takes the log of population (E), which is a generally good principle for data that’s constrained to be all positive. It also means that the coefficient goes to the multiplicative scale, which is nice for interpretation.

In the parameters block, Morris defines an intercept, beta0, and a vector of coefficients, betas, for the covariates. The beta0 coefficient is split out so it can be applied to the log_E data.

The model specifies that y comes from a Poisson distribution, and gives beta0 and betas simple standard normal distributions (mean 0, sd 1).

Lastly, the generated quantities block. In Morris’s words, this is for use in model checking and model comparison. Y_rep is used to hold replicated data. Morris uses some optimization in this block that I don’t entirely (or even really) understand, but this will be used to conduct posterior predictive checks. That is, we can simulate fake data and check their characteristics (mean, sd, quantiles) to see if the model does a good job. Morris uses leave-one-out cross validation, and uses the output from the loo() function–Expected Log Predictive Density (ELPD)–which gives the log-probability of new data given the model. A higher ELPD is better.

design_mat <- as.data.frame(crashes_tracts_geo) %>%
  select(prop_pubtransit, median_income, median_age, frag_index, traffic) %>%
  mutate(median_income = log(median_income),
         traffic = log(traffic))

pois_data <- list(
  N = nrow(crashes_tracts_geo),
  y = as.integer(crashes_tracts_geo$tot_crashes),
  E = as.integer(crashes_tracts_geo$population),
  K = ncol(design_mat),
  xs = design_mat
)
poisson_model_1 <- cmdstan_model(poisson_model_file)
set.seed(50)

poisson_fit_1 <- poisson_model_1$sample(data=pois_data, parallel_chains=cores, refresh=0)
## Running MCMC with 4 chains, at most 6 in parallel...
## 
## Chain 4 finished in 9.9 seconds.
## Chain 1 finished in 10.6 seconds.
## Chain 3 finished in 10.7 seconds.
## Chain 2 finished in 11.9 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 10.8 seconds.
## Total execution time: 12.1 seconds.
print(poisson_fit_1)
##  variable      mean    median    sd   mad        q5       q95 rhat ess_bulk
##  lp__     283546.16 283546.00  1.76  1.48 283543.00 283548.00 1.00     1598
##  beta0        -6.41     -6.41  0.09  0.09     -6.56     -6.25 1.00     1065
##  betas[1]     -0.20     -0.20  0.03  0.02     -0.24     -0.16 1.00     1099
##  betas[2]     -0.03     -0.03  0.01  0.01     -0.04     -0.02 1.00     1248
##  betas[3]     -0.01     -0.01  0.00  0.00     -0.01     -0.01 1.00     4725
##  betas[4]      0.00      0.00  0.00  0.00      0.00      0.01 1.00     2457
##  betas[5]      0.26      0.26  0.00  0.00      0.26      0.27 1.00     1814
##  y_rep[1]     29.84     30.00  5.46  5.93     21.00     39.00 1.00     3761
##  y_rep[2]     54.56     55.00  7.29  7.41     43.00     67.00 1.00     3293
##  y_rep[3]    102.82    103.00 10.03 10.38     86.00    120.00 1.00     4129
##  ess_tail
##      2082
##      1509
##      1723
##      1831
##      3012
##      2464
##      2146
##      3700
##      3827
##      3909
## 
##  # showing 10 of 3919 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

Refinement: Mean-Center Predictor Data